# -*- coding: utf-8 -*-
import scrapy
from urllib import parse

class TxSpider(scrapy.Spider):
    """Spider for Tencent job postings (hr.tencent.com).

    Scrapes the position listing table page by page, yielding one dict
    per job row, and follows the "next page" link until the last page.
    """

    name = 'tx'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php']

    def parse(self, response):
        """Parse one listing page.

        Yields:
            dict: one item per job row with position name, category,
            headcount, location and publish date (keys are Chinese,
            matching the site's column headers).

        Also yields a Request for the next listing page when one exists.
        """
        # Skip the header row ([0]) and the trailing pagination row ([-1]).
        tr_list = response.xpath('//table[@class="tablelist"]/tr')[1:-1]
        self.logger.debug('found %d job rows on %s', len(tr_list), response.url)
        for tr in tr_list:
            item = {}
            item['职位'] = tr.xpath('.//a/text()').extract_first()
            item['职位类别'] = tr.xpath('./td[2]/text()').extract_first()
            item['人数'] = tr.xpath('./td[3]/text()').extract_first()
            item['地点'] = tr.xpath('./td[4]/text()').extract_first()
            item['发布时间'] = tr.xpath('./td[5]/text()').extract_first()
            yield item

        # "下一页" = "next page". On the last page the link is present but
        # its href is the inert 'javascript:;'; if the link is missing
        # entirely, extract_first() returns None — guard against both, since
        # response.follow(None) would raise ValueError.
        next_url = response.xpath('//a[text()="下一页"]/@href').extract_first()
        if next_url and next_url != 'javascript:;':
            # response.follow resolves the relative href against response.url.
            yield response.follow(next_url, callback=self.parse)
